import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
pd.options.plotting.backend = 'plotly'
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error
from xgboost import XGBRegressor
from pygam import LinearGAM, s, l, f
from dsc80_utils import * # Feel free to uncomment and use this.
Step 1: Introduction¶
# Load the raw workbook, the previously cleaned CSV, and a header-trimmed view.
raw_df = pd.read_excel('data/outage.xlsx')
clean_df = pd.read_csv('data/outages_clean_df.csv')
# Re-parse the datetime columns, which come back as plain strings from CSV.
for _dt_col in ['outage_start_datetime', 'outage_restoration_datetime']:
    clean_df[_dt_col] = pd.to_datetime(clean_df[_dt_col])
# The real header row of the Excel sheet is row 6 (index 5).
df = pd.read_excel('data/outage.xlsx', header=5)
df
| variables | OBS | YEAR | MONTH | ... | AREAPCT_UC | PCT_LAND | PCT_WATER_TOT | PCT_WATER_INLAND | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Units | NaN | NaN | NaN | ... | % | % | % | % |
| 1 | NaN | 1.0 | 2011.0 | 7.0 | ... | 0.6 | 91.59 | 8.41 | 5.48 |
| 2 | NaN | 2.0 | 2014.0 | 5.0 | ... | 0.6 | 91.59 | 8.41 | 5.48 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1532 | NaN | 1532.0 | 2009.0 | 8.0 | ... | 0.15 | 98.31 | 1.69 | 1.69 |
| 1533 | NaN | 1533.0 | 2009.0 | 8.0 | ... | 0.15 | 98.31 | 1.69 | 1.69 |
| 1534 | NaN | 1534.0 | 2000.0 | NaN | ... | 0.02 | 85.76 | 14.24 | 2.9 |
1535 rows × 57 columns
This dataset contains detailed information about major U.S. power outages, combining outage event metadata, climate indicators, economic variables, and state-level demographic information. Each row represents a single outage event. Some variables tied to location and identification include year, month, state, postal code, North American Electric Reliability Corporation region, and NOAA-defined climate region. Among climate and weather indicators, we have climate anomaly classification, climate category, and hurricane names (if applicable). For specific outages, we have outage start and end times, restoration time, and outage duration. We also have outage cause and descriptions of each cause category. We have information on electrical demand lost, as well as how many customers were affected per outage. Information on electricity sales by sector and sector share of total sales is also available, in addition to counts of customers by sector and sector share of total customers. Economic indicators such as state per capita GDP, US per capita GDP, relative state GDP, year-over-year percentage change in state GDP, utility GDP, and utility sector contributions to GDP are also included, as well as state populations, populations in urban and urban cluster areas, and overall population density. We also have information on land and water composition and percentages per state.
Some interesting questions I want to explore are how certain factors affect outage duration, such as weather effects, state, location of outage, etc. Another one that seems interesting is which states are more prone to outages, and exploring why — what environmental or economic factors contribute. It could also be worth exploring which years have more power outages (either per state or in total) and looking into why those years have more or fewer outages.
Step 2: Data Cleaning and Exploratory Data Analysis¶
# Drop the Excel formatting column and index rows by their observation number.
df = df.drop(columns='variables').reset_index(drop=True).set_index('OBS')

# Normalize column names: trimmed, lowercase, '.' and ' ' replaced with '_'.
normalized_names = []
for name in df.columns:
    name = name.strip().lower()
    name = name.replace('.', '_').replace(' ', '_')
    normalized_names.append(name)
df.columns = normalized_names
df
| year | month | u_s__state | postal_code | ... | areapct_uc | pct_land | pct_water_tot | pct_water_inland | |
|---|---|---|---|---|---|---|---|---|---|
| OBS | |||||||||
| NaN | NaN | NaN | NaN | NaN | ... | % | % | % | % |
| 1.0 | 2011.0 | 7.0 | Minnesota | MN | ... | 0.6 | 91.59 | 8.41 | 5.48 |
| 2.0 | 2014.0 | 5.0 | Minnesota | MN | ... | 0.6 | 91.59 | 8.41 | 5.48 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1532.0 | 2009.0 | 8.0 | South Dakota | SD | ... | 0.15 | 98.31 | 1.69 | 1.69 |
| 1533.0 | 2009.0 | 8.0 | South Dakota | SD | ... | 0.15 | 98.31 | 1.69 | 1.69 |
| 1534.0 | 2000.0 | NaN | Alaska | AK | ... | 0.02 | 85.76 | 14.24 | 2.9 |
1535 rows × 55 columns
# 'u_s__state' was an awkward artifact of the name normalization — rename it.
df = df.rename(columns={'u_s__state': 'us_state'})

# Columns retained for the analysis (location, climate, outage timing/impact,
# economic indicators, and population/urbanization measures).
columns_to_keep = [
    'year',
    'month',
    'us_state',
    'nerc_region',
    'climate_region',
    'climate_category',
    'anomaly_level',
    'outage_start_date',
    'outage_start_time',
    'outage_restoration_date',
    'outage_restoration_time',
    'outage_duration',
    'cause_category',
    'demand_loss_mw',
    'customers_affected',
    'total_price',
    'total_sales',
    'total_customers',
    'poppct_urban',
    'popden_urban',
    'areapct_urban',
    'util_realgsp',
    'total_realgsp',
    'util_contri',
    'population',
]

# Keep only the selected columns, in the order listed above.
df = df[columns_to_keep]
print(f"DataFrame now contains {len(df.columns)} columns.")
DataFrame now contains 25 columns.
# The first remaining row holds the unit strings for each column; capture them
# before the row is dropped.
units_row = df.iloc[0]
units_dict = {}
for col in df.columns:
    value = units_row[col]
    units_dict[col] = value if pd.notna(value) else np.nan
units_dict
{'year': nan,
'month': nan,
'us_state': nan,
'nerc_region': nan,
'climate_region': nan,
'climate_category': nan,
'anomaly_level': 'numeric',
'outage_start_date': 'Day of the week, Month Day, Year',
'outage_start_time': 'Hour:Minute:Second (AM / PM)',
'outage_restoration_date': 'Day of the week, Month Day, Year',
'outage_restoration_time': 'Hour:Minute:Second (AM / PM)',
'outage_duration': 'mins',
'cause_category': nan,
'demand_loss_mw': 'Megawatt',
'customers_affected': nan,
'total_price': 'cents / kilowatt-hour',
'total_sales': 'Megawatt-hour',
'total_customers': nan,
'poppct_urban': '%',
'popden_urban': 'persons per square mile',
'areapct_urban': '%',
'util_realgsp': 'USD',
'total_realgsp': 'USD',
'util_contri': '%',
'population': nan}
# Drop the units row now that its contents are captured in units_dict.
df.drop(df.index[0], inplace=True)

# Coerce object columns to numeric where possible, leaving genuinely textual
# columns (e.g. state names) untouched. pd.to_numeric(errors='ignore') is
# deprecated and raises in future pandas versions, so attempt the conversion
# per column and keep the original values when it fails — same behavior,
# no FutureWarning.
for col in df.select_dtypes(include="object").columns:
    try:
        df[col] = pd.to_numeric(df[col])
    except (ValueError, TypeError):
        pass  # non-numeric column — keep as-is

# Drop exact duplicate rows.
df = df.drop_duplicates()
df
C:\Users\Jonat\AppData\Local\Temp\ipykernel_3384\1370707334.py:8: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead
| year | month | us_state | nerc_region | ... | util_realgsp | total_realgsp | util_contri | population | |
|---|---|---|---|---|---|---|---|---|---|
| OBS | |||||||||
| 1.0 | 2011.0 | 7.0 | Minnesota | MRO | ... | 4802 | 274182 | 1.75 | 5.35e+06 |
| 2.0 | 2014.0 | 5.0 | Minnesota | MRO | ... | 5226 | 291955 | 1.79 | 5.46e+06 |
| 3.0 | 2010.0 | 10.0 | Minnesota | MRO | ... | 4571 | 267895 | 1.71 | 5.31e+06 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1532.0 | 2009.0 | 8.0 | South Dakota | RFC | ... | 606 | 36504 | 1.66 | 8.07e+05 |
| 1533.0 | 2009.0 | 8.0 | South Dakota | MRO | ... | 606 | 36504 | 1.66 | 8.07e+05 |
| 1534.0 | 2000.0 | NaN | Alaska | ASCC | ... | 724 | 36046 | 2.01 | 6.28e+05 |
1525 rows × 25 columns
Cleaning some of the date and time fields. Parsed outage_start_date and outage_restoration_date into DateTime, combined them with the start and restoration times, then dropped the originals because they were redundant. Also recalculated outage_duration by subtracting the start DateTime from the restoration DateTime.
# 1. Parse the date columns — strings like "Friday, July 1, 2011"; anything
#    that doesn't match the format becomes NaT instead of raising.
df['outage_start_date'] = pd.to_datetime(
    df['outage_start_date'],
    format="%A, %B %d, %Y",
    errors='coerce'
)
df['outage_restoration_date'] = pd.to_datetime(
    df['outage_restoration_date'],
    format="%A, %B %d, %Y",
    errors='coerce'
)
# The time columns are not parsed here — they already hold datetime.time
# objects from the Excel read.
# df['outage_start_time'] already datetime.time
# df['outage_restoration_time'] already datetime.time

# 2. Combine date + time into a full datetime per row. datetime.combine needs
#    both halves, so NaT is produced whenever either half is missing.
df['outage_start_datetime'] = df.apply(
    lambda r: datetime.combine(r['outage_start_date'], r['outage_start_time'])
    if pd.notnull(r['outage_start_date']) and r['outage_start_time'] is not None
    else pd.NaT,
    axis=1
)
df['outage_restoration_datetime'] = df.apply(
    lambda r: datetime.combine(r['outage_restoration_date'], r['outage_restoration_time'])
    if pd.notnull(r['outage_restoration_date']) and r['outage_restoration_time'] is not None
    else pd.NaT,
    axis=1
)

# Columns that would all have to be missing for a row to carry no
# restoration/duration information at all.
outage_cols = [
    'outage_restoration_date',
    'outage_duration',
    'outage_restoration_datetime',
    'outage_restoration_time'
]
print(f'before: {df.shape}')
# Drop rows where all 4 columns are NaN — duration/restoration cannot be
# reconstructed for them.
df = df.dropna(subset=outage_cols, how='all')
# 58 rows dropped (1525 -> 1467 per the printed shapes)
print(f'after: {df.shape}')
# Drop the raw date/time parts — the combined datetime columns supersede them.
df = df.drop(columns=['outage_start_date', 'outage_start_time', 'outage_restoration_date', 'outage_restoration_time'])
before: (1525, 27) after: (1467, 27)
# 1. Any missing values left in the time-related columns?
time_cols = ['outage_start_datetime', 'outage_restoration_datetime', 'outage_duration']
print("Missing values in time columns:")
print(df[time_cols].isna().sum())

# 2. "Hidden" missing values: zeros in the impact columns are implausible,
#    since an outage rarely affects 0 customers or loses 0 MW of demand.
zero_customers = (df['customers_affected'] == 0).sum()
zero_demand = (df['demand_loss_mw'] == 0).sum()
print(f"\nEntries with 0 Customers Affected: {zero_customers}")
print(f"Entries with 0 Demand Loss (MW): {zero_demand}")
Missing values in time columns: outage_start_datetime 0 outage_restoration_datetime 0 outage_duration 0 dtype: int64 Entries with 0 Customers Affected: 199 Entries with 0 Demand Loss (MW): 183
No missing times after dropping all with missing values for all time fields (start, end, duration), but still 0 is kind of suspicious for customers affected and demand lost. Usually power outages affect at least someone or there is some power demand lost somewhere.
# Which cause categories produce the rows that report 0 customers affected?
zero_impact_rows = df.loc[df['customers_affected'].eq(0)]
print("Causes of outages with 0 customers affected:")
print(zero_impact_rows['cause_category'].value_counts())
Causes of outages with 0 customers affected: cause_category intentional attack 171 public appeal 9 severe weather 9 islanding 5 fuel supply emergency 4 equipment failure 1 Name: count, dtype: int64
Did a bit more research, federal requirement to report intentional attacks as outages, even if there weren't any people affected. They still could be like security incidents or other vandalism or related attacks that didn't succeed in cutting power, so a value of 0 makes sense here. Other causes (such as severe weather) might not make as much sense, so we should examine those further. Stuff like public appeal is probably something that had other reasons, like conserving power? Likely only actual missing data is in severe weather and equipment failure, with islanding, fuel supply emergency, and others probably being caused by other conditions than missing data.
# Conditional replacement: treat a reported 0 as missing only when the cause
# makes a zero-impact outage implausible (per the discussion above).
suspicious_causes = ['severe weather', 'equipment failure']
suspicious_zero_mask = (
    df['customers_affected'].eq(0)
    & df['cause_category'].isin(suspicious_causes)
)

# Null out both impact columns for the suspicious rows.
for impact_col in ['customers_affected', 'demand_loss_mw']:
    df.loc[suspicious_zero_mask, impact_col] = np.nan

print(f"Replaced {suspicious_zero_mask.sum()} suspicious zero-impact rows with NaN.")

# VERIFICATION
print("\nFinal Missing Value Check:")
print(df[['customers_affected', 'demand_loss_mw']].isna().sum())
Replaced 10 suspicious zero-impact rows with NaN. Final Missing Value Check: customers_affected 424 demand_loss_mw 677 dtype: int64
# Imputation code kept as a reference string — remove the quotes to apply it.
# For now imputation is deliberately skipped here (intended to be handled in
# the modeling pipeline instead).
"""
# 1. Impute customers_affected NaNs using the median customers affected for that cause category
# This uses the MAR assumption: missingness depends on the observed cause_category.
df['customers_affected'] = df['customers_affected'].fillna(
    df.groupby('cause_category')['customers_affected'].transform('median')
)

# 2. Impute demand_loss_mw NaNs using the median demand loss for that cause category
df['demand_loss_mw'] = df['demand_loss_mw'].fillna(
    df.groupby('cause_category')['demand_loss_mw'].transform('median')
)

# Final verification check
print("\nFinal NaN check for imputed columns (should all be 0):")
print(df[['customers_affected', 'demand_loss_mw']].isna().sum())
"""
'\n# 1. Impute customers_affected NaNs using the median customers affected for that cause category\n# This uses the MAR assumption: missingness depends on the observed cause_category.\ndf[\'customers_affected\'] = df[\'customers_affected\'].fillna(\n df.groupby(\'cause_category\')[\'customers_affected\'].transform(\'median\')\n)\n\n# 2. Impute demand_loss_mw NaNs using the median demand loss for that cause category\ndf[\'demand_loss_mw\'] = df[\'demand_loss_mw\'].fillna(\n df.groupby(\'cause_category\')[\'demand_loss_mw\'].transform(\'median\')\n)\n\n# Final verification check\nprint("\nFinal NaN check for imputed columns (should all be 0):")\nprint(df[[\'customers_affected\', \'demand_loss_mw\']].isna().sum())\n'
# Duration implied by the datetime pair, in minutes.
calculated_duration = (
    df['outage_restoration_datetime'] - df['outage_start_datetime']
).dt.total_seconds() / 60

# Flag rows whose recorded duration disagrees with the computed one by more
# than a 5-minute rounding tolerance.
diff = (df['outage_duration'] - calculated_duration).abs()
inconsistent_rows = diff[diff > 5]
print(f"\nNumber of rows with inconsistent durations: {len(inconsistent_rows)}")
Number of rows with inconsistent durations: 31
# Inspect the rows whose recorded and computed durations disagree.
inspect_cols = ['outage_start_datetime', 'outage_restoration_datetime', 'outage_duration']
df.loc[inconsistent_rows.index, inspect_cols]
| outage_start_datetime | outage_restoration_datetime | outage_duration | |
|---|---|---|---|
| OBS | |||
| 54.0 | 2014-01-24 00:00:00 | 2014-04-09 11:53:00 | 108653.0 |
| 64.0 | 2014-03-04 09:06:00 | 2014-03-17 09:06:00 | 18660.0 |
| 72.0 | 2012-10-29 00:00:00 | 2012-11-09 23:59:00 | 17339.0 |
| ... | ... | ... | ... |
| 990.0 | 2014-02-07 07:00:00 | 2014-03-21 08:00:00 | 60480.0 |
| 1208.0 | 2016-03-03 11:00:00 | 2016-04-06 19:47:00 | 49427.0 |
| 1405.0 | 2010-03-13 12:00:00 | 2010-03-15 20:05:00 | 3305.0 |
31 rows × 3 columns
Maybe some of our start data and restoration data are estimates?
# Trust the datetimes: overwrite the recorded duration with the computed one
# (minutes, from restoration minus start).
df['outage_duration'] = calculated_duration

# Sanity check: no outage should end before it starts.
ends_before_start = df['outage_restoration_datetime'] < df['outage_start_datetime']
negative_duration = df[ends_before_start]
print(f"Rows with negative duration (restoration before start): {len(negative_duration)}")
Rows with negative duration (restoration before start): 0
# Spot-check every string column for inconsistent spellings/capitalization
# by dumping its full value counts (NaN included).
for col in df.select_dtypes(include='object').columns:
    print(f"{col} value counts:\n{df[col].value_counts(dropna=False)}\n")
us_state value counts:
us_state
California 197
Texas 121
Michigan 95
...
Montana 3
South Dakota 2
North Dakota 1
Name: count, Length: 49, dtype: int64
nerc_region value counts:
nerc_region
WECC 422
RFC 415
SERC 191
...
FRCC, SERC 1
HI 1
PR 1
Name: count, Length: 13, dtype: int64
climate_region value counts:
climate_region
Northeast 343
South 215
West 204
...
Southwest 87
West North Central 16
NaN 5
Name: count, Length: 10, dtype: int64
climate_category value counts:
climate_category
normal 726
cold 459
warm 282
Name: count, dtype: int64
cause_category value counts:
cause_category
severe weather 741
intentional attack 402
system operability disruption 123
public appeal 65
equipment failure 55
islanding 44
fuel supply emergency 37
Name: count, dtype: int64
Overall Missingness
# Columns that still contain missing values, most-missing first.
na_counts = df.isna().sum()
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    print(na_counts[na_counts > 0].sort_values(ascending=False))
demand_loss_mw 677 customers_affected 424 total_price 12 total_sales 12 climate_region 5 dtype: int64
We can impute climate_region for outages where it's missing, based on the mode (most common) value of climate_region in the matching state. 'Unknown' is used if there are no available values of climate_region for that state.
# --- 1. Per-state mode of climate_region ---
def _first_mode(values):
    """Most frequent value of a group, or None when nothing is observed."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None

state_region_mode = df.groupby('us_state')['climate_region'].apply(_first_mode)

# --- 2. Imputation: fill missing climate_region from the state's modal
#        region (mapping each row's state through the mode Series) ---
df['climate_region'] = df['climate_region'].fillna(
    df['us_state'].map(state_region_mode)
)

# --- 3. Fallback for states with no observed climate_region at all ---
df['climate_region'] = df['climate_region'].fillna('Unknown')

# Impute missing price/sales figures with the state's median value.
for col in ['total_price', 'total_sales']:
    df[col] = df[col].fillna(df.groupby('us_state')[col].transform('median'))
# Final missingness summary: counts and percentages per column.
na_counts = df.isna().sum()
missing = pd.DataFrame({
    'missing_count': na_counts,
    'missing_percent': na_counts / len(df) * 100,
})
pd.set_option('display.max_rows', None)  # show every row of the summary
missing
| missing_count | missing_percent | |
|---|---|---|
| year | 0 | 0.00 |
| month | 0 | 0.00 |
| us_state | 0 | 0.00 |
| nerc_region | 0 | 0.00 |
| climate_region | 0 | 0.00 |
| climate_category | 0 | 0.00 |
| anomaly_level | 0 | 0.00 |
| outage_duration | 0 | 0.00 |
| cause_category | 0 | 0.00 |
| demand_loss_mw | 677 | 46.15 |
| customers_affected | 424 | 28.90 |
| total_price | 0 | 0.00 |
| total_sales | 0 | 0.00 |
| total_customers | 0 | 0.00 |
| poppct_urban | 0 | 0.00 |
| popden_urban | 0 | 0.00 |
| areapct_urban | 0 | 0.00 |
| util_realgsp | 0 | 0.00 |
| total_realgsp | 0 | 0.00 |
| util_contri | 0 | 0.00 |
| population | 0 | 0.00 |
| outage_start_datetime | 0 | 0.00 |
| outage_restoration_datetime | 0 | 0.00 |
# Restore the default row-display limit.
pd.reset_option('display.max_rows')
# Persist the cleaned DataFrame — this is the same path clean_df is read from.
df.to_csv('data/outages_clean_df.csv', index=False)
Plotting number of unique outages per state over time
# Guard: ensure the year column is numeric before grouping.
df['year'] = pd.to_numeric(df['year'], errors='coerce')

# Five states with the most recorded outages (reused by later plots).
top5_states = df['us_state'].value_counts().head(5).index
df_top5 = df[df['us_state'].isin(top5_states)]

# Outage counts per year across all states.
total_yearly_counts = df.groupby('year').size().reset_index(name='outages')

fig_annual_outages_line = px.line(
    total_yearly_counts,
    x='year',
    y='outages',
    markers=True,
    title='Annual Power Outages',
)
# Force a tick at every year so none are elided.
fig_annual_outages_line.update_layout(xaxis=dict(dtick=1))
fig_annual_outages_line.show()
fig_annual_outages_line.write_html("plots/annual_outages_line.html")
# Aggregate total outages by cause category (all years combined)
outages_by_cause = df.groupby(['cause_category']).size().reset_index(name='Total Outages')
outages_by_cause = outages_by_cause.sort_values(by='Total Outages', ascending=True)
fig_outages_causes= px.bar(
    outages_by_cause,
    x='Total Outages',
    y='cause_category',
    orientation='h',  # Horizontal bar chart for better label readability
    title='Total Number of Power Outages by Cause Category',
    labels={
        'Total Outages': 'Number of Outages',
        'cause_category': 'Outage Cause'
    }
)
fig_outages_causes.update_layout(yaxis={'categoryorder':'total ascending'})  # Ensures the bars are plotted from smallest to largest
fig_outages_causes.show()
fig_outages_causes.write_html("plots/outages_causes.html")
# Outage counts per (year, state) for the five most outage-prone states.
yearly_counts = df_top5.groupby(['year', 'us_state']).size().reset_index(name='outages')

top5_line_fig = px.line(
    yearly_counts,
    x='year',
    y='outages',
    color='us_state',
    markers=True,
    title='Annual Power Outages for Top 5 States',
)
top5_line_fig.update_layout(xaxis=dict(dtick=1))  # one tick per year
top5_line_fig.show()
# top5_line_fig.write_image("top_5_state_outages_total.png")
# Total outage count for each of the top-5 states.
state_totals = df_top5['us_state'].value_counts().reset_index()
state_totals.columns = ['us_state', 'total_outages']

top5_bar_fig = px.bar(
    state_totals,
    x='us_state',
    y='total_outages',
    color='us_state',
    title='Total Power Outages for Top 5 States',
)
top5_bar_fig.show()
# top5_bar_fig.write_image("top_5_state_outages_yearly.png")
# 1. Full state name -> USPS two-letter abbreviation. The two-letter codes are
#    what the choropleth below needs (locationmode="USA-states").
us_state_to_abbrev = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR", "California": "CA",
    "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE", "Florida": "FL", "Georgia": "GA",
    "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
    "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO",
    "Montana": "MT", "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ",
    "New Mexico": "NM", "New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH",
    "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Vermont": "VT",
    "Virginia": "VA", "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
    "District of Columbia": "DC"
}
# 2. Map each outage's state name to its two-letter code
df['state_code'] = df['us_state'].map(us_state_to_abbrev)
# Any state name absent from the mapping would surface here as NaN
print("Unmapped states:", df[df['state_code'].isna()]['us_state'].unique())
Unmapped states: []
# Outage count per state, keyed by the two-letter code for the map.
outage_counts = df.groupby('state_code').size().reset_index(name='Outage_Count')

choropleth_fig = px.choropleth(
    outage_counts,
    locations='state_code',
    locationmode="USA-states",  # interpret locations as USPS codes
    scope="usa",
    color='Outage_Count',
    color_continuous_scale="reds",
    title='Total Power Outages by State',
)
choropleth_fig.show()
choropleth_fig.write_html("plots/outages_by_state.html")
# Normalized metric: outages per one million residents, using the state's
# mean population across its outage rows.
df_normalized = df.copy().dropna(subset=['population'])

# Named aggregation: mean population, plus a row count (via 'year') as the
# number of outages per state.
state_stats = df_normalized.groupby('us_state').agg(
    population=('population', 'mean'),
    total_outages=('year', 'count'),
)

# Outages per 1M people, top 10 states.
state_stats['outages_per_million'] = (
    state_stats['total_outages'] / state_stats['population']
) * 1_000_000
state_stats = (
    state_stats
    .sort_values('outages_per_million', ascending=False)
    .head(10)
    .reset_index()
)

fig_norm = px.bar(
    state_stats,
    x='us_state',
    y='outages_per_million',
    title='Power Outages per 1 Million People (Top 10 States)',
    labels={'outages_per_million': 'Outages / 1M People'},
    color='outages_per_million',
    color_continuous_scale='Reds',
)
fig_norm.show()
Delaware has quite a few outages per capita, might explore later.
# year should already be numeric at this point, but coerce defensively.
df['year'] = pd.to_numeric(df['year'], errors='coerce')

# Outage counts per (year, cause) pair.
yearly_cause = (
    df.groupby(['year', 'cause_category'])
    .size()
    .reset_index(name='total_outages')
)

cause_year_fig = px.line(
    yearly_cause,
    x='year',
    y='total_outages',
    color='cause_category',
    markers=True,
    title='Total Power Outages Across All States by Cause per Year',
)
cause_year_fig.update_layout(xaxis=dict(dtick=1))  # tick every year
cause_year_fig.show()
# cause_year_fig.write_image("total_outages_by_cause_per_year.png")
# Violin plot of duration by cause; log scale tames the heavy right skew.
df_plot = df.dropna(subset=['outage_duration', 'cause_category'])

violin_fig = px.violin(
    df_plot,
    x='cause_category',
    y='outage_duration',
    color='cause_category',
    box=True,
    points='all',
    log_y=True,  # log scale is what makes the distributions comparable
    title='Distribution of Outage Duration by Cause (Log Scale)',
)
violin_fig.update_layout(
    xaxis_title='Cause Category',
    yaxis_title='Outage Duration (minutes, log scale)',
    showlegend=False,
)
violin_fig.show()
violin_fig.write_html("plots/vio_duration_by_cause.html")
Plotting outage durations
# --- 1. Log-transform the duration; clip to >= 1 minute so log(0) is
#        impossible even if a zero duration slips through. ---
clean_df['log_outage_duration'] = np.log(clean_df['outage_duration'].clip(lower=1))

# --- 2. Raw duration: heavily right-skewed. ---
raw_hist = px.histogram(
    clean_df,
    x='outage_duration',
    title='Raw Outage Duration Distribution (Minutes)',
    nbins=100,
    height=400,
    labels={'outage_duration': 'Outage Duration (Minutes)'},
)
raw_hist.show()
raw_hist.write_html("plots/raw_outage_duration.html")

# --- 3. Log-transformed duration: much closer to symmetric. ---
log_hist = px.histogram(
    clean_df,
    x='log_outage_duration',
    title='Log-Transformed Outage Duration Distribution',
    nbins=100,
    height=400,
    labels={'log_outage_duration': 'Log(Outage Duration)'},
)
log_hist.show()
log_hist.write_html("plots/log_outage_duration.html")
# Scatter: total customers in a state vs. customers affected by each outage,
# on log-log axes.
customers_scatter = px.scatter(
    df,
    x='total_customers',
    y='customers_affected',
    log_x=True,
    log_y=True,
    title='Relationship Between Total Customers and Customers Affected (Log Scale)',
    labels={
        "total_customers": "Total Customers (Log Scale)",
        "customers_affected": "Customers Affected (Log Scale)",
    },
)

# Custom x ticks with compact labels — the automatic log ticks crowd here.
x_ticks = {
    200000: '200K',
    500000: '500K',
    1000000: '1M',
    2000000: '2M',
    5000000: '5M',
    10000000: '10M',
    20000000: '20M',
}

customers_scatter.update_layout(
    xaxis=dict(
        tickvals=list(x_ticks.keys()),
        ticktext=list(x_ticks.values()),
        tickfont=dict(size=10),
        title_font=dict(size=14),
    ),
    yaxis=dict(
        tickfont=dict(size=10),
        title_font=dict(size=14),
    ),
    showlegend=False,
    title_font=dict(size=16),
)
# Slight transparency + fixed marker size so dense regions stay readable.
customers_scatter.update_traces(marker=dict(opacity=0.7, size=6))
customers_scatter.show()
customers_scatter.write_html("plots/customers_total_vs_affected.html")
# Scatter: outage magnitude (customers affected) vs. duration, log-log axes.
duration_scatter = px.scatter(
    df,
    x='customers_affected',
    y='outage_duration',
    log_x=True,  # Log scale for magnitude
    log_y=True,  # Log scale for duration
    title='Outage Magnitude vs. Duration by Cause Category (Log-Log Scale)',
    labels={
        "customers_affected": "Customers Affected (Log Scale)",
        "outage_duration": "Outage Duration (minutes, Log Scale)",
    },
)
# Readability tweaks: translucent markers and thousands-separated tick labels.
duration_scatter.update_traces(marker=dict(opacity=0.7))
duration_scatter.update_layout(
    xaxis_tickformat=',.0f',
    yaxis_tickformat=',.0f',
    showlegend=True,
)
duration_scatter.show()
duration_scatter.write_html("plots/duration_vs_affected.html")
# Average severity metrics (duration, customers affected, demand loss) per
# NERC region, with presentation-friendly column names.
metric_cols = ['outage_duration', 'customers_affected', 'demand_loss_mw']

average_severity_by_region = (
    df.groupby('nerc_region')[metric_cols]
    .mean()
    .reset_index()
    .rename(columns={
        'nerc_region': 'NERC_REGION',
        'outage_duration': 'Avg_Outage_Duration_min',
        'customers_affected': 'Avg_Customers_Affected',
        'demand_loss_mw': 'Avg_Demand_Loss_MW',
    })
)

print("--- Average Severity Metrics by NERC Region ---")
print(average_severity_by_region)
--- Average Severity Metrics by NERC Region ---
NERC_REGION Avg_Outage_Duration_min Avg_Customers_Affected \
0 ECAR 5603.31 260623.68
1 FRCC 4268.73 352978.89
2 FRCC, SERC 372.00 NaN
3 HECO 895.33 126728.67
4 HI 1367.00 294000.00
5 MRO 2934.95 88984.97
6 NPCC 3262.58 111137.26
7 PR 174.00 62000.00
8 RFC 3485.76 128631.37
9 SERC 1735.45 112447.97
10 SPP 2693.77 194674.97
11 TRE 2799.27 228424.51
12 WECC 1371.53 131209.17
Avg_Demand_Loss_MW
0 1394.48
1 1004.12
2 NaN
3 466.67
4 1060.00
5 207.37
6 968.70
7 220.00
8 296.29
9 582.14
10 142.00
11 635.62
12 507.00
# State-by-cause outage counts. NOTE(review): with aggfunc='count' the choice
# of values column DOES matter — count excludes rows where that column is
# NaN, so if anomaly_level has missing values these counts undercount.
# Confirm anomaly_level is fully populated, or count on the index instead.
pivot = df.pivot_table(
    values='anomaly_level',
    index='us_state',
    columns='cause_category',
    aggfunc='count',
    fill_value=0  # state/cause pairs with no outages show 0, not NaN
)
pivot.head()
| cause_category | equipment failure | fuel supply emergency | intentional attack | islanding | public appeal | severe weather | system operability disruption |
|---|---|---|---|---|---|---|---|
| us_state | |||||||
| Alabama | 0 | 0 | 1 | 0 | 0 | 4 | 0 |
| Arizona | 4 | 0 | 15 | 0 | 0 | 3 | 2 |
| Arkansas | 1 | 0 | 6 | 1 | 7 | 10 | 0 |
| California | 21 | 10 | 24 | 28 | 9 | 66 | 39 |
| Colorado | 0 | 0 | 5 | 1 | 0 | 4 | 4 |
# Temporarily lift both display limits so the full table prints.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
print(average_severity_by_region)
NERC_REGION Avg_Outage_Duration_min Avg_Customers_Affected \
0 ECAR 5603.31 260623.68
1 FRCC 4268.73 352978.89
2 FRCC, SERC 372.00 NaN
3 HECO 895.33 126728.67
4 HI 1367.00 294000.00
5 MRO 2934.95 88984.97
6 NPCC 3262.58 111137.26
7 PR 174.00 62000.00
8 RFC 3485.76 128631.37
9 SERC 1735.45 112447.97
10 SPP 2693.77 194674.97
11 TRE 2799.27 228424.51
12 WECC 1371.53 131209.17
Avg_Demand_Loss_MW
0 1394.48
1 1004.12
2 NaN
3 466.67
4 1060.00
5 207.37
6 968.70
7 220.00
8 296.29
9 582.14
10 142.00
11 635.62
12 507.00
# Restore the default display limits.
pd.reset_option("display.max_rows")
pd.reset_option("display.max_columns")
Step 3: Assessment of Missingness¶
Permutation Test¶
Analyzing the dependency of missingness in demand_loss_mw with cause_category
# Binary flag: 1 where demand_loss_mw is missing, 0 otherwise.
df['is_demand_missing'] = df['demand_loss_mw'].isna().astype(int)
# Missingness rate of demand_loss_mw within each cause category.
missing_by_cause = df.groupby('cause_category')['is_demand_missing'].mean()
# Observed test statistic: spread (max - min) of the missingness rates
# across categories.
obs_diff = missing_by_cause.max() - missing_by_cause.min()
obs_diff
np.float64(0.43286713286713285)
# Keep missing cause labels visible as their own 'Unknown' category.
df['cause_category'] = df['cause_category'].fillna("Unknown")
missing_by_cause = df.groupby('cause_category')['is_demand_missing'].mean()

missingness_bar = px.bar(
    missing_by_cause.reset_index(),
    x='is_demand_missing',
    y='cause_category',
    orientation='h',
    labels={'is_demand_missing': 'Missingness Rate'},
    title="Missingness in demand_loss_mw by Cause Category",
)
missingness_bar.show()
missingness_bar.write_html("plots/missingness_plot.html")
# Permutation test: is missingness of demand_loss_mw independent of
# cause_category?  Under H0 the missingness flags are exchangeable, so we
# shuffle them and recompute the statistic (max - min per-category
# missingness rate) to build a null distribution.
n_permutations = 5000

# Hoist loop-invariant data: the grouping labels and the flag values never
# change, so we avoid the original per-iteration df.copy() and its two
# redundant groupby passes (5000 full-frame copies saved).
cause_labels = df['cause_category'].to_numpy()
missing_flags = df['is_demand_missing'].to_numpy()

permuted_diffs = []
for _ in range(n_permutations):
    shuffled = np.random.permutation(missing_flags)
    rates = pd.Series(shuffled).groupby(cause_labels).mean()
    permuted_diffs.append(rates.max() - rates.min())
permuted_diffs = np.array(permuted_diffs)

# p-value: proportion of permutations more extreme than observed
p_value = np.mean(permuted_diffs >= obs_diff)
p_value
np.float64(0.0004)
# Visualize the permutation null distribution with the observed statistic
# overlaid as a vertical reference line.
fig_hist_permutation_test_plot = go.Figure()

# Density histogram of the permuted test statistics.
fig_hist_permutation_test_plot.add_trace(go.Histogram(
    x=permuted_diffs,
    nbinsx=30,
    histnorm='probability density',
    opacity=0.7,
    name='Permutation Distribution',
))

# Tallest density bin — used to size the vertical reference line.
peak_density = max(np.histogram(permuted_diffs, bins=30, density=True)[0])

# Observed statistic drawn as a red line from 0 up to the peak density.
fig_hist_permutation_test_plot.add_trace(go.Scatter(
    x=[obs_diff] * 2,
    y=[0, peak_density],
    mode='lines',
    line=dict(color='red', width=3),
    name=f'Observed Diff = {obs_diff:.4f}',
))

fig_hist_permutation_test_plot.update_layout(
    title="Permutation Test: Missingness of demand_loss_mw vs Cause Category",
    xaxis_title="Permutation Test Statistic (Max Diff in Missingness)",
    yaxis_title="Density",
    bargap=0.05,
)

# Show inline and save as interactive HTML.
fig_hist_permutation_test_plot.show()
fig_hist_permutation_test_plot.write_html("plots/permutation_test_plot.html")
Analyzing the dependency of missingness in demand_loss_mw with month
# Ensure month is clean (no NaNs)
# NOTE(review): filling a numeric month column with the string "Unknown"
# turns the column into object dtype; the rates printed below show only
# numeric months, so presumably no NaN months remained at this point in
# the filtered data — verify upstream.
df['month'] = df['month'].fillna("Unknown")
# Missingness rate of demand_loss_mw by month
missing_by_month = df.groupby('month')['is_demand_missing'].mean()
print(missing_by_month)
month 1.0 0.49 2.0 0.44 3.0 0.51 4.0 0.43 5.0 0.36 6.0 0.51 7.0 0.51 8.0 0.54 9.0 0.36 10.0 0.49 11.0 0.35 12.0 0.40 Name: is_demand_missing, dtype: float64
# Horizontal bar chart: missingness rate of demand_loss_mw for each month.
fig_bar_missingness_by_month = px.bar(
    missing_by_month.reset_index(),
    x='is_demand_missing',
    y='month',
    orientation='h',
    labels={'is_demand_missing': 'Missingness Rate'},
    title="Missingness in demand_loss_mw by Month",
)
fig_bar_missingness_by_month.show()
fig_bar_missingness_by_month.write_html("plots/missingness_by_month.html")

# Observed statistic: spread between the most- and least-missing months.
obs_diff = missing_by_month.max() - missing_by_month.min()
print("Observed difference in missingness rates:", obs_diff)
Observed difference in missingness rates: 0.19579807411730377
# Permutation test for month vs missingness, same statistic as the cause
# analysis (max - min per-group missingness rate under shuffled flags).
n_permutations = 5000

# Loop-invariant arrays: avoids the original per-iteration df.copy() and
# the duplicated groupby (the grouping keys and flags never change).
month_labels = df['month'].to_numpy()
missing_flags = df['is_demand_missing'].to_numpy()

permuted_diffs = []
for _ in range(n_permutations):
    shuffled = np.random.permutation(missing_flags)
    rates = pd.Series(shuffled).groupby(month_labels).mean()
    permuted_diffs.append(rates.max() - rates.min())
permuted_diffs = np.array(permuted_diffs)

# p-value: proportion of permutations more extreme than observed
p_value = np.mean(permuted_diffs >= obs_diff)
print("Permutation test p-value:", p_value)
Permutation test p-value: 0.133
# Null distribution for the month analysis, with the observed statistic
# overlaid as a vertical reference line.
fig_hist_permutation_test_by_month = go.Figure()

fig_hist_permutation_test_by_month.add_trace(go.Histogram(
    x=permuted_diffs,
    nbinsx=30,
    histnorm='probability density',
    opacity=0.7,
    name='Permutation Distribution',
))

# Tallest density bin — sets the height of the reference line.
peak_density = max(np.histogram(permuted_diffs, bins=30, density=True)[0])

fig_hist_permutation_test_by_month.add_trace(go.Scatter(
    x=[obs_diff] * 2,
    y=[0, peak_density],
    mode='lines',
    line=dict(color='red', width=3),
    name=f'Observed Diff = {obs_diff:.4f}',
))

fig_hist_permutation_test_by_month.update_layout(
    title="Permutation Test: Missingness of demand_loss_mw vs Month",
    xaxis_title="Permutation Test Statistic (Max Diff in Missingness)",
    yaxis_title="Density",
    bargap=0.05,
)

fig_hist_permutation_test_by_month.show()
fig_hist_permutation_test_by_month.write_html("plots/permutation_test_by_month.html")
Step 4: Hypothesis Testing¶
Hypothesis 1: Testing if outages caused by severe weather have longer average durations (compared to other causes)
H0: Outages caused by Severe Weather do not have a statistically significantly longer average duration than outages caused by other sources (mean duration of severe weather outages is the same as mean duration of outages of other sources)
HA: Outages caused by Severe Weather have a statistically significantly longer average duration than outages caused by other sources (mean duration of severe weather outages is greater than mean duration of outages of other sources)
# Indicator separating severe-weather outages (1) from all others (0).
df['is_severe_weather'] = (df['cause_category'] == 'severe weather').astype(int)

# --- A. Calculate the Observed Difference in Means ---
# Log-transform duration, clipping at 1 minute so log(0) cannot occur.
df['log_outage_duration'] = np.log(df['outage_duration'].clip(lower=1))

severe_mask = df['is_severe_weather'] == 1
obs_mean_severe = df.loc[severe_mask, 'log_outage_duration'].mean()
obs_mean_other = df.loc[~severe_mask, 'log_outage_duration'].mean()
obs_diff = obs_mean_severe - obs_mean_other

print(f"Observed Mean Log Duration (Severe Weather): {obs_mean_severe:.3f}")
print(f"Observed Mean Log Duration (Other Sources): {obs_mean_other:.3f}")
print(f"Observed Difference in Means (Test Statistic): {obs_diff:.3f}")
Observed Mean Log Duration (Severe Weather): 7.417 Observed Mean Log Duration (Other Sources): 4.301 Observed Difference in Means (Test Statistic): 3.115
# --- B. Permutation Procedure ---
n_repetitions = 5000  # 5,000 shuffles for a stable null distribution
simulated_diffs = []

# BUG FIX: the original shuffled df['is_severe_weather'].values in place.
# .values can be a *view* into the DataFrame's underlying array, so
# np.random.shuffle scrambled the real is_severe_weather column for every
# later cell (it is reused as a model feature below).
# np.random.permutation returns a shuffled *copy*, leaving df untouched.
labels = df['is_severe_weather'].to_numpy()
log_durations = df['log_outage_duration']

for _ in range(n_repetitions):
    # 1. Shuffled copy of the group labels, aligned to df's index
    shuffled_is_severe_weather = pd.Series(
        np.random.permutation(labels), index=df.index
    )
    # 2. Difference of group means under the shuffled assignment
    mean_severe_shuffled = log_durations[shuffled_is_severe_weather == 1].mean()
    mean_other_shuffled = log_durations[shuffled_is_severe_weather == 0].mean()
    simulated_diffs.append(mean_severe_shuffled - mean_other_shuffled)

# Convert to a NumPy array for vectorized comparison below
simulated_diffs = np.array(simulated_diffs)

# --- C. Calculate P-Value ---
# One-tailed (HA: severe-weather durations are longer), so count how often
# the simulated difference is >= the observed difference.
p_value = (simulated_diffs >= obs_diff).mean()
print(f"\nP-Value: {p_value:.5f}")
# --- D. Visualization ---
# Pre-compute bin counts so the reference line can span the tallest bar.
counts, _ = np.histogram(simulated_diffs, bins=50)
max_count = counts.max()

# Histogram of the permutation null distribution.
fig_hist_hypothesis_test = px.histogram(
    x=simulated_diffs, nbins=50,
    title='Null Distribution of Mean Difference (Permutation Test)',
    labels={'x': 'Simulated Difference in Mean Log Duration'},
)

# Dotted red marker at the observed statistic, 5% taller than the peak bar.
fig_hist_hypothesis_test.add_trace(go.Scatter(
    x=[obs_diff] * 2,
    y=[0, max_count * 1.05],
    mode='lines',
    line=dict(color='red', width=3, dash='dot'),
    name=f'Observed Diff = {obs_diff:.3f}',
))

# Keep the legend on so the reference line is labelled.
fig_hist_hypothesis_test.update_layout(showlegend=True)
fig_hist_hypothesis_test.show()
fig_hist_hypothesis_test.write_html("plots/hypothesis_test.html")
P-Value: 0.00000
With a p-value of about 0.00000, we can reject the null hypothesis, as we have strong evidence that the outages caused by severe weather have statistically longer average durations than outages caused by all other sources. This reflects our real-world intuition that outages caused by severe weather are often more difficult to address, taking more time to locate, travel to, and repair, compared to other outage types.
Hypothesis 2: Testing if richer states (higher GDP per capita) have a lower average log power outage duration than states with a lower GDP per capita. In essence, richer states experience shorter power outages than states with lower GDP per capita.
H0: There is no linear relationship between a state's GDP per capita and its average log power outage duration.
HA: There is a negative linear relationship between a state's GDP per capita and its average log power outage duration.
# --- 1. Aggregation and Transformation ---
# Per capita real gross state product for each outage's state/year.
df['pc_realgsp_state'] = df['total_realgsp'] / df['population']

# Keep only outages that actually affected customers.
df_no_0_customers_affected = df[df['customers_affected'] > 0]

# A. One row per (state, year): mean log duration (Y) and mean per capita
#    GSP (X — constant within a state-year, so the mean is that value).
df_agg = (
    df_no_0_customers_affected
    .groupby(['us_state', 'year'])
    .agg(
        mean_log_duration=('log_outage_duration', 'mean'),
        mean_pc_realgsp_state=('pc_realgsp_state', 'mean'),
    )
    .reset_index()
)

# B. Log-transform the predictor (clipped away from zero) to test for a
#    proportional relationship.
df_agg['log_pc_realgsp_state'] = np.log(df_agg['mean_pc_realgsp_state'].clip(lower=1e-5))

# --- 2. Regression Test ---
# H0: B1 = 0 vs HA: B1 < 0 for
#   mean_log_duration = B0 + B1 * log(per capita GSP) + error
model = smf.ols('mean_log_duration ~ log_pc_realgsp_state', data=df_agg).fit()

# --- 3. Print the Summary ---
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: mean_log_duration R-squared: 0.000
Model: OLS Adj. R-squared: -0.003
Method: Least Squares F-statistic: 0.06931
Date: Thu, 11 Dec 2025 Prob (F-statistic): 0.793
Time: 11:57:32 Log-Likelihood: -602.83
No. Observations: 311 AIC: 1210.
Df Residuals: 309 BIC: 1217.
Df Model: 1
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
Intercept 7.2318 1.340 5.397 0.000 4.595 9.868
log_pc_realgsp_state 0.1155 0.439 0.263 0.793 -0.748 0.979
==============================================================================
Omnibus: 80.486 Durbin-Watson: 1.514
Prob(Omnibus): 0.000 Jarque-Bera (JB): 176.522
Skew: -1.288 Prob(JB): 4.66e-39
Kurtosis: 5.643 Cond. No. 47.3
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Using the p-value of the coefficient (log_pc_realgsp_state) we fail to reject the null hypothesis at the 0.05 significance level, obtaining a p-value of 0.793. Our coefficient is also in the opposite direction than what we would have if the alternative hypothesis were true, getting a positive +0.1155 coefficient instead of a negative one. Since this model fit also had a very low R^2 value (0.000), not much variation is explained by GDP per capita, with other predictors being more significant.
Step 5: Framing a Prediction Problem¶
We want to predict outage duration without knowing the exact cause of the outage. This can help us predict how long the power would be out (as people are typically unaware of the cause), which can help in civilian applications or other similar situations. In this problem, we want to predict log_outage_duration (since the normal outage_duration has a heavy right skew) and models typically perform better on approximately normally distributed data. In the first (baseline) model, we will use all columns as predictors, excluding 'log_outage_duration', 'outage_duration', 'cause_category', and 'is_severe_weather'. We will use a pipeline, incorporating train-test split with cross-validation, standard scaling, and one-hot encoding for categorical variables, to fit a supervised, multiple-linear regression model. To evaluate the performance of this model, we will be using root mean squared error, as well as R^2 to measure how much variance our model explains.
Step 6: Baseline Model¶
# --- 0. Essential Filtering (Only Target/Zero-Impact rows are dropped) ---
prev_df_len = len(df)

# 1. Keep rows with a usable target: duration present and strictly
#    positive (the log target is meaningless otherwise).
df_filtered = df[df['log_outage_duration'].notna() & (df['outage_duration'] > 0)]

# 2. Drop zero-customer outages (nobody would notice them); NaN values of
#    customers_affected are kept so the pipeline's imputer can handle them.
df_filtered = df_filtered[df_filtered['customers_affected'] != 0]

df = df_filtered
print(f"Data filtered. Retained {len(df)} of {prev_df_len} rows for modeling.")
Data filtered. Retained 1246 of 1467 rows for modeling.
# --- 1. Preprocessing Setup ---
# Hour of day (0-23) the outage began, treated as a categorical feature.
df['start_hour'] = df['outage_start_datetime'].dt.hour.astype('category')
# Define the columns
# Numeric predictors: utility price/sales/customer totals plus state-level
# demographic and economic aggregates.
numeric_features = [
    'total_price', 'total_sales',
    'total_customers',
    'population', 'poppct_urban', 'popden_urban', 'areapct_urban',
    'util_realgsp', 'util_contri',
    'pc_realgsp_state', 'total_realgsp'
]
# Categorical predictors (one-hot encoded in the pipeline below).
categorical_features = [
    'month', 'year', 'start_hour',
    'us_state',
    'nerc_region', 'climate_region', 'climate_category',
    'anomaly_level',
    'is_demand_missing',
    'is_severe_weather'
]
# Excluded from X: the target and its raw form, restoration info (would
# leak the answer), and fields assumed unknown at prediction time.
excluded_cols = [
    'log_outage_duration', 'outage_duration',
    'outage_restoration_datetime', 'outage_start_datetime',
    'demand_loss_mw',
    'cause_category', 'customers_affected'# likely unknown to affected individuals
]
# --- 2. Train/Test Split ---
X = df.drop(columns=excluded_cols, errors='ignore')
y = df['log_outage_duration']
# 80/20 split; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=67
)
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")
Training set size: 996 Testing set size: 250
# --- 3. Robust Preprocessing Pipeline Setup (Handles remaining NaNs) ---
# Numeric columns: median-impute (adding missingness-indicator columns),
# then standardize to zero mean / unit variance.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        # Categorical columns: one-hot encode, dropping the first level and
        # zero-encoding categories unseen during fit.
        ('cat',
         OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False),
         categorical_features)
    ],
    remainder='drop'  # discard everything outside the chosen predictors
)

# --- 4. Baseline Model Pipeline Creation and Fitting ---
# Ridge (L2-regularized linear regression) on the transformed features.
baseline_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))
])

# --- 5. Model Pipeline Fitting ---
baseline_model_pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(add_indicator=True,
strategy='median')),
('scaler',
StandardScaler())]),
['total_price', 'total_sales',
'total_customers',
'population', 'poppct_urban',
'popden_urban',
'areapct_urban',
'util_realgsp',
'util_contri',
'pc_realgsp_state',
'total_realgsp']),
('cat',
OneHotEncoder(drop='first',
handle_unknown='ignore',
sparse_output=False),
['month', 'year',
'start_hour', 'us_state',
'nerc_region',
'climate_region',
'climate_category',
'anomaly_level',
'is_demand_missing',
'is_severe_weather'])])),
('regressor', Ridge())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(add_indicator=True,
strategy='median')),
('scaler',
StandardScaler())]),
['total_price', 'total_sales',
'total_customers',
'population', 'poppct_urban',
'popden_urban',
'areapct_urban',
'util_realgsp',
'util_contri',
'pc_realgsp_state',
'total_realgsp']),
('cat',
OneHotEncoder(drop='first',
handle_unknown='ignore',
sparse_output=False),
['month', 'year',
'start_hour', 'us_state',
'nerc_region',
'climate_region',
'climate_category',
'anomaly_level',
'is_demand_missing',
'is_severe_weather'])])),
('regressor', Ridge())])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(add_indicator=True,
strategy='median')),
('scaler', StandardScaler())]),
['total_price', 'total_sales',
'total_customers', 'population',
'poppct_urban', 'popden_urban',
'areapct_urban', 'util_realgsp',
'util_contri', 'pc_realgsp_state',
'total_realgsp']),
('cat',
OneHotEncoder(drop='first',
handle_unknown='ignore',
sparse_output=False),
['month', 'year', 'start_hour', 'us_state',
'nerc_region', 'climate_region',
'climate_category', 'anomaly_level',
'is_demand_missing', 'is_severe_weather'])])['total_price', 'total_sales', 'total_customers', 'population', 'poppct_urban', 'popden_urban', 'areapct_urban', 'util_realgsp', 'util_contri', 'pc_realgsp_state', 'total_realgsp']
SimpleImputer(add_indicator=True, strategy='median')
StandardScaler()
['month', 'year', 'start_hour', 'us_state', 'nerc_region', 'climate_region', 'climate_category', 'anomaly_level', 'is_demand_missing', 'is_severe_weather']
OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
Ridge()
# --- 6. Evaluate Train and Test Scores ---
print("\n--- Base Model Evaluation ---")
# A. Calculate and print Train Scores
# (in-sample fit; compare with the test scores below to gauge overfitting)
y_train_pred = baseline_model_pipeline.predict(X_train)
train_r2 = r2_score(y_train, y_train_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"Train R-squared Score (R2): {train_r2:.4f}")
print(f"Train Root Mean Squared Error (RMSE): {train_rmse:.4f}")
# B. Calculate and print Test Scores
y_test_pred = baseline_model_pipeline.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("\n--- Final Test Set Evaluation ---")
print(f"Test R-squared Score (R2): {test_r2:.4f}")
print(f"Test Root Mean Squared Error (RMSE): {test_rmse:.4f}")
# Real-scale metrics
# Back-transform from log-minutes to minutes; the median absolute error is
# reported alongside MAE because a few very long outages inflate the mean.
y_pred_mins = np.exp(y_test_pred)
y_test_mins = np.exp(y_test)
mae_mins = mean_absolute_error(y_test_mins, y_pred_mins)
medae_mins = median_absolute_error(y_test_mins, y_pred_mins)
print(f"\nReal-Scale Performance:")
print(f"Mean Absolute Error: {mae_mins:.2f} mins")
print(f"Median Absolute Error: {medae_mins:.2f} mins")
--- Base Model Evaluation --- Train R-squared Score (R2): 0.4486 Train Root Mean Squared Error (RMSE): 1.7970 --- Final Test Set Evaluation --- Test R-squared Score (R2): 0.2180 Test Root Mean Squared Error (RMSE): 2.0450 Real-Scale Performance: Mean Absolute Error: 2673.73 mins Median Absolute Error: 918.21 mins
Step 7: Final Model¶
New Features¶
is_weekend (Binary Feature)
- Derived from: outage_start_datetime.dayofweek in [5, 6]
- Rationale: Repair crew availability differs on weekends vs weekdays. Weekend staffing is typically reduced, potentially leading to longer restoration times. This captures day-of-week patterns not present in the 'start_hour' feature.
year_month (Categorical Feature)
- Derived from: Combining year + month as "YYYY-MM"
- Rationale: Infrastructure quality, repair technology, and utility preparedness improve over time, while outage behavior also follows seasonal patterns. A combined year-month feature captures temporal trends that separate year and month features cannot represent (e.g., winter 2015 vs winter 2010).
is_peak_hour (Binary Feature)
- Derived from: start_hour in [7, 8, 9, 17, 18, 19]
- Rationale: Outages during peak demand hours (morning/evening commute) may receive different prioritization or face different complexity due to system strain. Creates a meaningful threshold from continuous hour.
econ_stress_region (Binary Feature)
- Derived from: util_contri > median(util_contri)
- Rationale: Regions where utilities contribute disproportionately to the economy may have different infrastructure investment patterns or repair prioritization policies.
Hyperparameter Tuning¶
n_estimators: [100, 150, 200] Rationale: More trees provide more stable predictions but increase computation time. Testing range to find optimal stability/speed tradeoff.
max_features: [0.4, 0.5, 0.6, 0.7] Rationale: Controls feature interaction and overfitting. Lower values force more diversity between trees. Our baseline showed overfitting, so we test restricted values to improve generalization.
min_samples_split: [10, 15, 20] Rationale: Prevents splitting on small samples that may not generalize. Higher values reduce overfitting by requiring more evidence before splits.
min_samples_leaf: [2, 4, 6] Rationale: Ensures leaf nodes have minimum samples. Higher values create smoother decision boundaries and reduce overfitting. Testing to find optimal regularization strength.
Method: GridSearchCV with 3-fold cross-validation Total combinations: 3 × 4 × 3 × 3 = 108
# --- 1. Feature Engineering Function ---
def create_engineered_features(X_data, df_original, median_contri=None):
    """
    Create 4 new engineered features from existing data.

    Parameters:
    -----------
    X_data : DataFrame
        Feature matrix (train or test); must contain 'start_hour' and
        'util_contri'.
    df_original : DataFrame
        Original dataframe with 'outage_start_datetime'; rows are looked
        up by X_data's index.
    median_contri : float, optional
        Threshold for the econ_stress_region flag.  Pass the *training*
        median when transforming a test set to avoid leaking test-set
        statistics; defaults to the median of X_data (original behavior).

    Returns:
    --------
    X_eng : DataFrame
        Copy of X_data with is_weekend, year_month, is_peak_hour and
        econ_stress_region columns added.
    """
    X_eng = X_data.copy()

    # Align the start datetimes to exactly the rows of this split.
    datetime_col = df_original.loc[X_data.index, 'outage_start_datetime']

    # Feature 1: Weekend indicator (Saturday=5, Sunday=6)
    X_eng['is_weekend'] = datetime_col.dt.dayofweek.isin([5, 6]).astype(int)

    # Feature 2: "YYYY-MM" label capturing joint year/season trends
    X_eng['year_month'] = (datetime_col.dt.year.astype(str) + '-' +
                           datetime_col.dt.month.astype(str).str.zfill(2))

    # Feature 3: Peak hour indicator (morning/evening rush)
    X_eng['is_peak_hour'] = X_data['start_hour'].isin([7, 8, 9, 17, 18, 19]).astype(int)

    # Feature 4: utility contribution above the (ideally training) median
    if median_contri is None:
        median_contri = X_data['util_contri'].median()
    X_eng['econ_stress_region'] = (X_data['util_contri'] > median_contri).fillna(False).astype(int)

    return X_eng
# --- 2. Create start_hour if not already present ---
# (idempotent re-derivation of the categorical hour-of-day feature)
df['start_hour'] = df['outage_start_datetime'].dt.hour.astype('category')
# --- 3. Define Features (same as baseline) ---
numeric_features = [
    'total_price', 'total_sales',
    'total_customers',
    'population', 'poppct_urban', 'popden_urban', 'areapct_urban',
    'util_realgsp', 'util_contri',
    'pc_realgsp_state', 'total_realgsp'
]
# Same as the baseline list minus 'year' (superseded by year_month below).
categorical_features = [
    'month', 'start_hour',
    'us_state',
    'nerc_region', 'climate_region', 'climate_category',
    'anomaly_level',
    'is_demand_missing',
    'is_severe_weather'
]
excluded_cols = [
    'log_outage_duration', 'outage_duration',
    'outage_restoration_datetime', 'outage_start_datetime',
    'demand_loss_mw',
    'cause_category', 'customers_affected',
    'year' # removed from cat features bc adding year_month
]
# --- 4. Apply Feature Engineering (keep original train test split)---
# NOTE(review): each call computes the util_contri median on its own split,
# so the test set's econ_stress_region threshold uses test-set statistics —
# consider passing the training median to the test-set call instead.
X_train_final = create_engineered_features(X_train, df)
X_test_final = create_engineered_features(X_test, df)
print("\nAll 4 engineered features created")
All 4 engineered features created
# --- 6. Update Feature Lists ---
# Binary engineered flags join the numeric list (imputed + scaled);
# year_month joins the categoricals (one-hot encoded).
final_numeric_features = numeric_features + [
    'is_weekend', 'is_peak_hour', 'econ_stress_region'
]
final_categorical_features = categorical_features + ['year_month']

# --- 7. Build Final Preprocessing Pipeline ---
numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
    ('scaler', StandardScaler())
])
final_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, final_numeric_features),
        ('cat',
         OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False),
         final_categorical_features)
    ],
    remainder='drop'
)
# NOTE: Assuming X_train_final, X_test_final, and y_train/y_test are already defined
# and the final_preprocessor ColumnTransformer is defined as in your previous step.
# --- 1. FIT PREPROCESSOR and TRANSFORM DATA ---
# FIX: Must fit the preprocessor on the training data first!
final_preprocessor.fit(X_train_final)
X_train_processed = final_preprocessor.transform(X_train_final)
X_test_processed = final_preprocessor.transform(X_test_final)
# --- 2. DEFINE GAM FORMULA (Dynamic Construction) ---
# Base formula starts with the smooth (s) and linear (l) terms
# NOTE(review): in pygam, the second positional argument of s() is
# n_splines, so s(4, 5) is a 5-spline smooth of column 4 — column 5 gets
# no term of its own.  If smooths of both columns 4 and 5 were intended,
# this should be s(4) + s(5); verify against the pygam docs.
GAM_FORMULA = s(0) + s(4, 5) + \
    s(1) + s(2) + s(3) + s(6) + s(7) + s(8) + s(9) + s(10) + s(11)
# Dynamically add the Factor (f) terms for all one-hot encoded columns
# (columns 0-11 are the scaled numeric features; everything from index 12
# onward is an imputer-indicator or one-hot column, modeled as a factor).
START_INDEX_CAT = 12
NUM_TOTAL_COLS = X_train_processed.shape[1]
for i in range(START_INDEX_CAT, NUM_TOTAL_COLS):
    # Sum individual factor terms: f(12) + f(13) + f(14) + ...
    GAM_FORMULA += f(i)
print(f"GAM Formula successfully compiled with {NUM_TOTAL_COLS} total feature columns.")
# --- 3. MANUAL ITERATIVE HYPERPARAMETER TUNING (Lambda Search) ---
# NOTE(review): lambda is selected by *test-set* RMSE, so the "final"
# metrics below are optimistically biased; a validation split or CV on the
# training data would be the cleaner protocol.
LAMBDA_CANDIDATES = [0.1, 1.0, 10.0, 100.0]
best_lambda = None
best_rmse = float('inf')
best_gam = None
print("\n--- Starting Manual Iterative Tuning (Lambda Search) ---")
for lam in LAMBDA_CANDIDATES:
    gam = LinearGAM(
        terms=GAM_FORMULA,
        lam=lam,
        fit_intercept=True
    ).fit(X_train_processed, y_train)  # CORRECTED: Use X_train_processed
    y_test_pred_gam = gam.predict(X_test_processed)  # CORRECTED: Use X_test_processed
    current_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_gam))
    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_lambda = lam
        best_gam = gam
    print(f"Lambda={lam:.1f} Test RMSE: {current_rmse:.4f}")
# --- 4. FINAL MODEL EVALUATION ---
final_rmse = np.sqrt(mean_squared_error(y_test, best_gam.predict(X_test_processed)))
# NOTE(review): presumably pygam's score() returns R^2 here (the printed
# value is labelled R2) — confirm against the pygam documentation.
final_r2 = best_gam.score(X_test_processed, y_test)
print("\n--- Final Model (Tuned GAM) Test Set Evaluation ---")
print(f"**Tuned Hyperparameter:** Lambda={best_lambda}")
# NOTE(review): this hard-coded 1.9804 disagrees with the baseline test
# RMSE printed earlier (2.0450) — update it or derive it from a variable.
print(f"Baseline Test RMSE: 1.9804")
print(f"Final Model Test RMSE: {final_rmse:.4f}")
print(f"Final Model Test R2: {final_r2:.4f}")
c:\Users\Jonat\miniforge3\envs\dsc80-fa25\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [9] during transform. These unknown categories will be encoded as all zeros
GAM Formula successfully compiled with 316 total feature columns. --- Starting Manual Iterative Tuning (Lambda Search) --- Lambda=0.1 Test RMSE: 2.0873 Lambda=1.0 Test RMSE: 2.0544 Lambda=10.0 Test RMSE: 2.0285 Lambda=100.0 Test RMSE: 2.0305 --- Final Model (Tuned GAM) Test Set Evaluation --- **Tuned Hyperparameter:** Lambda=10.0 Baseline Test RMSE: 1.9804 Final Model Test RMSE: 2.0285 Final Model Test R2: 0.2306
Random Forest Regressor (ACTUAL FINAL MODEL)
# --- 8. Hyperparameter Tuning with GridSearchCV ---
print("\n" + "="*70)
print("Hyperparameter Tuning via GridSearchCV")
print("="*70)

# Random-forest search space (see rationale above).
param_grid = {
    'regressor__n_estimators': [100, 150, 200],
    'regressor__max_features': [0.4, 0.5, 0.6, 0.7],
    'regressor__min_samples_split': [10, 15, 20],
    'regressor__min_samples_leaf': [2, 4, 6],
}

# Derive the combination count from the grid itself so the message stays
# correct if the grid is edited (the original hard-coded "= 108").
n_combinations = int(np.prod([len(v) for v in param_grid.values()]))
print(f"Testing {n_combinations} parameter combinations with 3-fold CV")

grid_search = GridSearchCV(
    estimator=Pipeline(steps=[
        ('preprocessor', final_preprocessor),
        ('regressor', RandomForestRegressor(random_state=67, n_jobs=-1))
    ]),
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=1
)

# Fit on training data (CV happens internally)
grid_search.fit(X_train_final, y_train)

print(f"\nBest Cross-Validation Score: {grid_search.best_score_:.4f}")
print(f"Best Parameters Found:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

# --- 9. Extract Best Model (refit on the full training set by default) ---
final_model = grid_search.best_estimator_
====================================================================== Hyperparameter Tuning via GridSearchCV ====================================================================== Testing 108 = 108 parameter combinations with 3-fold CV Fitting 3 folds for each of 108 candidates, totalling 324 fits Best Cross-Validation Score: 0.3073 Best Parameters Found: regressor__max_features: 0.4 regressor__min_samples_leaf: 4 regressor__min_samples_split: 10 regressor__n_estimators: 150
# --- 10. Evaluate on Test Set ---
print("\n" + "="*70)
print("Final Model Evaluation on Test Set")
print("="*70)
# Best pipeline from the grid search (GridSearchCV refits it on the full
# training set by default before exposing best_estimator_).
y_test_pred = final_model.predict(X_test_final)
test_r2 = r2_score(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Test R²: {test_r2:.4f}")
print(f"Test RMSE (log-scale): {test_rmse:.4f}")
# Real-scale metrics
# Back-transform log-minute predictions to minutes for interpretability.
y_pred_mins = np.exp(y_test_pred)
y_test_mins = np.exp(y_test)
mae_mins = mean_absolute_error(y_test_mins, y_pred_mins)
medae_mins = median_absolute_error(y_test_mins, y_pred_mins)
print(f"\nReal-Scale Performance:")
print(f"Mean Absolute Error: {mae_mins:.2f} mins")
print(f"Median Absolute Error: {medae_mins:.2f} mins")
====================================================================== Final Model Evaluation on Test Set ====================================================================== Test R²: 0.3690 Test RMSE (log-scale): 1.8371 Real-Scale Performance: Mean Absolute Error: 2118.95 mins Median Absolute Error: 741.32 mins
c:\Users\Jonat\miniforge3\envs\dsc80-fa25\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [9] during transform. These unknown categories will be encoded as all zeros
# --- 11. Feature Importance ---
print("\n" + "="*70)
print("TOP 15 MOST IMPORTANT FEATURES")
print("="*70)
# Get the fitted preprocessor from the pipeline
fitted_preprocessor = final_model.named_steps['preprocessor']
# Extract feature names after transformation
# (imputer names include the add_indicator missingness columns; the
# encoder expands each categorical into one column per retained level —
# the concatenation order matches the ColumnTransformer's output order)
feature_names = (
    fitted_preprocessor.named_transformers_['num']
    .named_steps['imputer'].get_feature_names_out(final_numeric_features).tolist() +
    fitted_preprocessor.named_transformers_['cat']
    .get_feature_names_out(final_categorical_features).tolist()
)
# Impurity-based importances from the fitted random forest.
importances = final_model.named_steps['regressor'].feature_importances_
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values('importance', ascending=False)
print(feature_importance_df.head(15).to_string(index=False))
# Show engineered feature importance
print("\n" + "="*70)
print("ENGINEERED FEATURE IMPORTANCE")
print("="*70)
# NOTE(review): 'high_impact' matches no engineered feature created above —
# presumably a leftover from an earlier feature set; harmless but removable.
eng_features = feature_importance_df[
    feature_importance_df['feature'].str.contains('weekend|peak_hour|high_impact|econ_stress|year_month', case=False)
]
print(eng_features.head(10).to_string(index=False))
======================================================================
TOP 15 MOST IMPORTANT FEATURES
======================================================================
feature importance
total_sales 0.11
util_realgsp 0.11
util_contri 0.09
total_price 0.08
total_customers 0.08
population 0.06
total_realgsp 0.05
nerc_region_WECC 0.04
areapct_urban 0.04
pc_realgsp_state 0.03
popden_urban 0.02
poppct_urban 0.02
is_demand_missing_1 0.01
us_state_Delaware 0.01
climate_category_normal 0.01
======================================================================
ENGINEERED FEATURE IMPORTANCE
======================================================================
feature importance
is_weekend 8.70e-03
year_month_2014-06 8.34e-03
is_peak_hour 8.30e-03
year_month_2014-01 3.13e-03
year_month_2013-08 2.54e-03
year_month_2008-09 2.49e-03
econ_stress_region 2.47e-03
year_month_2006-12 2.42e-03
year_month_2008-06 1.96e-03
year_month_2013-05 1.61e-03
Step 8: Fairness Analysis¶
# --- 1. Define Groups ---
# The split threshold comes from TRAINING data only, to avoid leakage.
median_gsp = X_train['pc_realgsp_state'].median()
print(f"Median per capita GSP (from training): {median_gsp:.2f}")

# Label each test row by whether its state sits above the training median.
is_high_gsp = X_test['pc_realgsp_state'] > median_gsp
test_groups = is_high_gsp.map({True: 'High GSP', False: 'Low GSP'})

print(f"\nTest set composition:")
print(test_groups.value_counts())

# --- 2. Get Predictions ---
y_test_pred = final_model.predict(X_test_final)
# --- 3. Calculation Function ---
def calculate_rmse_by_group(y_true, y_pred, groups):
    """
    Compute RMSE separately for each group label.

    All inputs are coerced to flat numpy arrays so mismatched pandas
    indices cannot cause silent misalignment — masking is purely
    positional. Returns a dict mapping each unique group label to its
    RMSE (NaN for a group with no members).
    """
    # Flatten to 1-D numpy arrays: positional alignment only.
    truths = np.asarray(y_true).ravel()
    preds = np.asarray(y_pred).ravel()
    labels = np.asarray(groups)

    rmse_by_group = {}
    for label in np.unique(labels):
        member = labels == label
        if member.any():  # guard against an empty subset
            mse = mean_squared_error(truths[member], preds[member])
            rmse_by_group[label] = np.sqrt(mse)
        else:
            rmse_by_group[label] = np.nan
    return rmse_by_group
# --- 4. Observed RMSE ---
# Per-group error on the actual (unshuffled) group labels.
observed_rmse = calculate_rmse_by_group(y_test, y_test_pred, test_groups)
print(f"Low GSP RMSE: {observed_rmse['Low GSP']:.4f}")
print(f"High GSP RMSE: {observed_rmse['High GSP']:.4f}")

observed_diff = abs(observed_rmse['Low GSP'] - observed_rmse['High GSP'])
print(f"Difference (Low GSP - High GSP): {observed_rmse['Low GSP'] - observed_rmse['High GSP']:.4f}")
print(f"Absolute Difference: {observed_diff:.4f}")

# --- 5. Permutation Test ---
# Null hypothesis: the model is equally accurate for both groups, so the
# observed RMSE gap could arise from a random assignment of group labels.
n_permutations = 5000
np.random.seed(67)  # reproducibility
groups_array = np.asarray(test_groups)

permuted_diffs = []
for _ in range(n_permutations):
    # Break any real association between errors and group membership,
    # then re-measure the gap. Both labels are always present after a
    # shuffle, so the dict keys 'High GSP'/'Low GSP' always exist.
    shuffled = np.random.permutation(groups_array)
    metrics = calculate_rmse_by_group(y_test, y_test_pred, shuffled)
    permuted_diffs.append(abs(metrics['Low GSP'] - metrics['High GSP']))

# One-sided p-value: fraction of permuted gaps at least as large as observed.
p_value = np.mean(np.array(permuted_diffs) >= observed_diff)
print(f"Permutation test complete.")
print(f"p-value: {p_value:.4f}")
# --- 7. Visualization ---
# Work with flat numpy arrays so boolean masking is purely positional.
y_test_np = np.asarray(y_test).ravel()
y_pred_np = np.asarray(y_test_pred).ravel()
groups_np = np.asarray(test_groups)

# Per-observation absolute errors, split by GSP group for the box plots.
errors = np.abs(y_test_np - y_pred_np)
low_gsp_errors = errors[groups_np == 'Low GSP']
high_gsp_errors = errors[groups_np == 'High GSP']
# --- 8. Plotting ---
# Two-panel figure: permutation null distribution (left) and the spread of
# absolute prediction errors per GSP group (right).
fig_fairness = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        'Permutation Test Distribution',
        'Distribution of Prediction Errors by Group (Per Capita GSP)'
    ),
)

# Left panel: histogram of permuted RMSE differences under the null.
fig_fairness.add_trace(
    go.Histogram(
        x=permuted_diffs,
        name='Permuted Diff',
        marker_color='skyblue',
        opacity=0.7,
    ),
    row=1, col=1,
)

# Mark the observed difference with a vertical red line.
# NOTE(review): yref="paper" spans the full FIGURE height, not just the left
# subplot — with a single row the two are visually equivalent here.
fig_fairness.add_shape(
    type="line",
    x0=observed_diff, x1=observed_diff,
    y0=0, y1=1,
    xref="x1",
    yref="paper",
    line=dict(color="red", width=3),
)
fig_fairness.update_xaxes(title_text="Absolute Difference in RMSE (Permuted)", row=1, col=1)
fig_fairness.update_yaxes(title_text="Frequency", row=1, col=1)

# Right panel: one box plot of absolute errors per group.
for group_errors, group_label, group_color in (
    (low_gsp_errors, 'Low GSP', 'lightblue'),
    (high_gsp_errors, 'High GSP', 'coral'),
):
    fig_fairness.add_trace(
        go.Box(
            y=group_errors,
            name=group_label,
            marker_color=group_color,
        ),
        row=1, col=2,
    )
fig_fairness.update_yaxes(title_text="Absolute Error", row=1, col=2)

fig_fairness.update_layout(
    showlegend=False,
    height=450,
    width=900,
    title_text="Fairness Analysis: Permutation Test and Error Distribution",
)

# Persist an interactive copy, then render inline.
fig_fairness.write_html('plots/fairness_analysis_gsp.html')
fig_fairness.show()
Median per capita GSP (from training): 0.05 Test set composition: pc_realgsp_state Low GSP 133 High GSP 117 Name: count, dtype: int64 Low GSP RMSE: 1.9056 High GSP RMSE: 1.7559 Difference (Low GSP - High GSP): 0.1497 Absolute Difference: 0.1497
c:\Users\Jonat\miniforge3\envs\dsc80-fa25\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [9] during transform. These unknown categories will be encoded as all zeros
Permutation test complete. p-value: 0.4872